library(tidyverse)
library(magrittr)
library(glue)
library(Seurat)
library(grid)
library(biomaRt)
library(SingleR)
library(batchtools)
# stats
library(lme4)
library(broom)
# parallelization
library(BiocParallel)
library(future)
library(furrr)
# plotting
library(RColorBrewer)
library(ggsci)
library(Matrix)
library(plotly)
library(DT)
library(gtsummary)
library(aplot)
library(patchwork)
library(viridis)
library(vegan)
library(ggpubr)
library(ggside)
library(ggsci)
# setting paths
homedir <- "/central/groups/mthomson/jboktor"
wkdir <- glue("{homedir}/spatial_genomics/jess_2024-01-23")
source(glue("{wkdir}/notebooks/R_scripts/_misc_functions.R"))
# 12gb limit (1500*1024^2 = 1572864000) * 8
options(future.globals.maxSize= 12582912000)
future::plan("multisession", workers = 32)WildR Brain Tissue Spatial Transcriptomics Analysis
Background
Here we analyze
Analysis Setup
Environment setup.
Summary stats of data
abca_color_pal <- readRDS(glue("{wkdir}/data/interim/abca_color_pal.rds"))
merged_rois <- readRDS(
glue("{wkdir}/data/interim/merged_roi_seurat_2024-07-15.rds")
)
# Adding cell annots to object ----
preds_combined <- readRDS(
glue(
"{wkdir}/data/interim/alignments/",
"2024-07-15_ABCA-1&2_SingleR_predictions_combined.rds"
)
)
# adding singleR labels into seurat object
merged_rois$singleR_labels <-
preds_combined$labels[
match(rownames(merged_rois@meta.data), rownames(preds_combined))
]
merged_rois$singleR_labels_refined <-
preds_combined$pruned.labels[
match(rownames(merged_rois@meta.data), rownames(preds_combined))
]
# Adding staNMF data to object ---
PPs_r <- readRDS(
glue("{wkdir}/data/interim/osNMF/osNMF_PPs_K11_2024-07-15.rds")
)
merged_rois <- Seurat::AddMetaData(
object = merged_rois,
metadata = PPs_r
)
merged_rois$slice_id <-
glue("{toupper(merged_rois$group)} {merged_rois$run} {merged_rois$roi}")
meta_df <- merged_rois@meta.data %>% glimpse()Goal is to remove problematic cells characterized by 1. Annotation with a cell type not likely to exist in this bregma region 2. High distortion signal from serial genes (high osNMF 1 signal)
Key questions; 1. Are the cells that are highly represented in osNMF1 only influenced by issues with serial genes, or are there more general issues - caused by image focus. - Are high osNMF1 cells enriched for cell counts or altered in alpha diversity metrics compared to other cells of the same class that are low in osNMF 1
Visualizing cells in osNMF1 at various thresholds
threshold_list <- seq(0, 0.2, by = 0.025)
for (thesh in threshold_list){
message(glue("Plotting: {thesh}"))
p <- merged_rois@meta.data %>%
rownames_to_column(var = "cell_id") %>%
filter(osNMF_1 > thesh) %>%
arrange(osNMF_1) %>%
mutate(cell_id, factor(cell_id)) %>%
arrange(group) %>%
mutate(slice_id, factor(slice_id)) %>%
ggplot(aes(center_x, center_y, color = osNMF_1)) +
geom_point(size = 0.15) +
scale_color_viridis() +
coord_fixed() +
scale_y_reverse() +
facet_wrap(~slice_id, ncol = 4) +
theme_bw()
ggsave(
glue("{wkdir}/figures/osNMF_K11/osNMF1_thresholds/",
"K11_osNMF1_{thesh}_{Sys.Date()}.png"),
p,
width = 16, height = 16
)
}Remove cells that are are below x total counts in the reference set
For each cell, the Simpson’s Evenness index is calculated as follows:
- Calculate the Simpson’s diversity index \(D\):
\(D = \sum_{i=1}^{S} p_i^2\)
where \(p_i\) is the proportion of the total count for genes \(i\) and \(S\) is the total number of genes.
- Calculate the number of genes \(S\):
\(S = \text{specnumber}\)
- Calculate the Simpson’s Evenness index $ E $:
\(E = \frac{D}{\log(S)}\)
where \(\log\) is the natural logarithm.
# Define a function to calculate Simpson's Evenness for a single cell
calculate_simpsons_evenness_per_cell <- function(cell_counts) {
simpson_index <- vegan::diversity(cell_counts, index = "simpson")
species_count <- vegan::specnumber(cell_counts)
evenness <- simpson_index / log(species_count)
return(evenness)
}
counts <- as.matrix(merged_rois@assays$SCT@counts)
counts %>% dim
# Apply the function to each cell
simpsons_evenness <- t(counts) %>%
calculate_simpsons_evenness_per_cell()
merged_rois <- AddMetaData(merged_rois,
metadata = simpsons_evenness,
col.name = "simpsons_evenness"
)
meta_df <- merged_rois@meta.data %>% glimpse()
quantiles <- quantile(meta_df$simpsons_evenness,
probs = seq(0, 1, by = 0.01)) %>%
unique()
meta_df %<>%
mutate(simpsons_evenness_quantiles = cut(
simpsons_evenness,
breaks = quantiles,
labels = rev(paste0("Q", seq(1, length(quantiles) - 1))),
include.lowest = TRUE,
right = FALSE
)) %>%
mutate(high_evenness =
if_else(simpsons_evenness_quantiles == "Q1", TRUE, FALSE)) %>%
glimpse()
# save
meta_df %>%
select(simpsons_evenness, simpsons_evenness_quantiles, high_evenness) %>%
saveRDS(
glue("{wkdir}/data/interim/simpsons_evenness_{Sys.Date()}.rds")
)meta_df$high_evenness %>% table
p_nmf1_v_evenness <- meta_df %>%
ggplot(aes(simpsons_evenness, osNMF_1)) +
geom_point(size = 0.2, alpha = 0.3, aes(color = high_evenness)) +
stat_cor(label.x = 0.14, label.y = 0.21) +
stat_regline_equation(label.x = 0.14, label.y = 0.2) +
ggside::geom_xsidedensity(aes(y = after_stat(density))) +
geom_ysidedensity(aes(x = after_stat(density))) +
labs(x = "Simpson's Evenness", y = "osNMF_1") +
scale_color_d3() +
theme_bw() +
theme(ggside.panel.scale.x = .2, ggside.panel.scale.y = .2) +
scale_ysidex_continuous(guide = guide_axis(angle = 90), minor_breaks = NULL)
ggsave(
glue("{wkdir}/figures/osNMF_K11/osNMF1_vs_evenness_{Sys.Date()}.png"),
p_nmf1_v_evenness,
width = 7, height = 7
)
p_nmf1_v_evenness_immune <- meta_df %>%
filter(singleR_labels == "34 Immune") %>%
ggplot(aes(simpsons_evenness, osNMF_1)) +
geom_point(size = 0.2, alpha = 0.3, aes(color = high_evenness)) +
stat_cor(label.x = 0.14, label.y = 0.21) +
stat_regline_equation(label.x = 0.14, label.y = 0.2) +
ggside::geom_xsidedensity(aes(y = after_stat(density))) +
geom_ysidedensity(aes(x = after_stat(density))) +
labs(x = "Simpson's Evenness", y = "osNMF_1") +
theme_bw() +
scale_color_d3() +
theme(ggside.panel.scale.x = .2, ggside.panel.scale.y = .2) +
scale_ysidex_continuous(guide = guide_axis(angle = 90), minor_breaks = NULL)
ggsave(
glue("{wkdir}/figures/osNMF_K11/osNMF1_vs_evenness_34-Immune_{Sys.Date()}.png"),
p_nmf1_v_evenness_immune,
width = 7, height = 7
)Cells in our osNMF1 dimension show a positive correlation with simpsons evenness, suggesting that these cells are not defined by high expression of a single gene but rather multiple genes. Previous analysis suggests this NMF dimension capture serial gene expression by looking at the weights of genes.
Filtering unlikely cell-types
ref_meta %>% glimpse
ref_meta %>%
group_by(class) %>%
summarize(n = n()) %>%
arrange(n) %>%
View()
ref_meta %>%
group_by(class) %>%
summarize(n = n()) %>%
filter(n < 600) %>%
pull(class)
joined_relab_stats %>%
filter(n_mean_ref < 100) %>% View
pull(singleR_labels) %>%
unique()
#' Black list was determined by manually inspecting the data and identifying
#' low abundance cell types that are disporportionately abundant in our seqFISH dataset
black_list <- c(
"10 LSX GABA",
"15 HY Gnrh1 Glut",
"22 MB-HB Sero",
"23 P Glut",
"24 MY Glut",
"25 Pineal Glut",
"26 P GABA",
"27 MY GABA",
"28 CB GABA",
"29 CB Glut",
"32 OEC"
)Filtering Data
Criteria for filtering:
- Cells with osNMF 1 > 0.125
- Cells with simpsons evenness score in the top 1% of the distribution
- Annotated as unlikely cell types (blacklist)
# add evenness score to metadata
evenness_meta <- readRDS(
glue("{wkdir}/data/interim/simpsons_evenness_2024-07-22.rds")
) %>% glimpse
merged_rois$high_evenness <- evenness_meta$high_evenness
lowq_cells <- merged_rois %>%
subset(
singleR_labels %in% black_list |
osNMF_1 >= 0.125 |
high_evenness
)
# printing a plot of low quality cells
p_lowq <- lowq_cells@meta.data %>%
ggplot(aes(center_x, center_y, color = singleR_labels)) +
geom_point(size = 0.1) +
scale_color_manual(values = abca_color_pal[["class_colors"]]) +
coord_fixed() +
scale_y_reverse() +
facet_wrap(~slice_id, ncol = 4) +
guides(colour = guide_legend(override.aes = list(size=3))) +
theme_bw()
ggsave(
glue("{wkdir}/figures/low-quality-cells_{Sys.Date()}.png"),
p_lowq,
width = 16, height = 16
)From 1,076,046 cells to 977,050